* Settings
*   Pin command behavior to Stata 16, then run the project-wide settings script.
*   The global SSDIMed (project root used in every path below) must already be
*   defined when this script starts, because the path on the next line depends on it.
version 16
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"

* Run as batch; auto-creates *.log file with same base name as the *.do file
* > statamp -b 03a_build_analysis_samples.do & 

* -----------------------------------------------------------------------------------------------
* Create master panel that will form the basis for analysis
*   runtime: 1 hour 
* -----------------------------------------------------------------------------------------------

* Load the enrollment master cohort panel
use "$SSDIMed/data/proc/medicare/master_cohort_enroll.dta", clear

* Discard variables that are not needed for the analysis
drop file_vintage_init bene_zip5* fipscounty_init fipscounty ///
     covstart_init covstart_count mbsf_year_init bene_dob crnt_bic

* Build a male indicator from the labelled sex variable
*   The numeric code is looked up through the value label (and checked to be 1)
*   rather than hard-coded in the comparison.
label list sex
local male_code = "Male":sex
assert `male_code' == 1
gen male_init = (sex_init == `male_code')
label var male_init "Sex = Male, 1st year observed in Medicare elig. files"
drop sex_init

* Derive month / year / elapsed-years variables from the coverage start date
gen covstart_month = mofd(covstart_fill)
format covstart_month %tm
label var covstart_month "mofd(covstart_fill)"

gen covstart_year = year(covstart_fill)
label var covstart_year "year(covstart_fill)"

gen years_since_covstart = rfrnc_yr - year(covstart_fill)
label var years_since_covstart "Years since Medicare coverage started = rfrnc_yr - year(covstart_fill)"

order covstart_month covstart_year years_since_covstart, after(covstart_fill)

* Cleaning: remove every bene who is ever observed in a year before their recorded
* coverage start year (the whole bene is removed, not just the offending rows)
gegen seen_before_cov = max(years_since_covstart < 0), by(bene_id)
qui glevelsof bene_id if seen_before_cov == 1
* Exactly 13 such benes are expected; a different count signals changed input data
assert r(J) == 13
keep if seen_before_cov != 1
drop seen_before_cov

* After cleaning, elapsed years must be non-missing and within 0-24
assert inrange(years_since_covstart, 0, 24)

* Attach the unemp*atapp variables, keyed on county and coverage-start month
*   A temporary copy of fipscounty_firstnm is made under the key name the
*   using file expects (fipscounty) and is removed again after the merge.
gen fipscounty = fipscounty_firstnm
merge m:1 fipscounty covstart_month                                   ///
    using "$SSDIMed/data/proc/public/unemp_pop_atapp.dta",            ///
    keep(master match) noreport keepusing(unemp*atapp)

* Unmatched master rows are kept; record which rows found a match
gen byte in_unemp_pop = (_merge == 3)
label var in_unemp_pop "Observation matches into the unemp_pop_atapp data"
drop fipscounty _merge

* Age attained in the observation year (calendar-year difference, not exact age)
gen age = rfrnc_yr - year(bene_dob_init)
label var age "Age at end of year = rfrnc_yr - year(bene_dob_init)"
order bene_id rfrnc_yr age age_????_covstart_fill age_init race_init male_init bene_dob_init 

* Consistency checks on ages and years
*   - age in any observed year is no lower than age in the coverage start year
*   - no observation precedes the bene's initial year
*   - age in the coverage start year lies in 20-64
assert age_year_covstart_fill <= age
assert init_yr <= rfrnc_yr
assert age_year_covstart_fill >= 20 & age_year_covstart_fill <= 64

* Park the base panel in a temp file; it is reloaded once the spending panel is built
tempfile base_panel
save `base_panel'

* Build the annual spending panel (always-true switch kept so the step can be toggled)
if 1 {
  use "$SSDIMed/data/proc/medicare/master_cohort_enroll_pmt.dta", clear
  assert rfrnc_yr >= 1999

  * Adjust medical spending for inflation (Medical care in U.S. city avg, all urban consumers, seas. adj. (ref: 2017m12))
  *   The CPI file is monthly, so each year is matched on its January record.
  *   Every panel year must be found in the CPI file; unused CPI months are discarded.
  gen yearmonth = ym(rfrnc_yr, 1)
  merge m:1 yearmonth using "$SSDIMed/data/proc/bls/cpi/bls_cpi_us_monthly.dta", ///
      assert(match using) keep(match) nogen noreport keepusing(cpi_med_us_yyyy)

  * Deflate each payment variable in place and prefix its existing label
  local pmt_vars tot_pmt ptb_pmt other_pmt
  foreach v of local pmt_vars {
    local old_label : variable label `v'
    replace `v' = `v' / cpi_med_us_yyyy
    label var `v' "(CPI-adjusted) `old_label'"
  }
  drop yearmonth cpi_med_us_yyyy

  * Park the cost and use panel in a temp file for the merge that follows
  tempfile cost_use_panel
  save `cost_use_panel'
}

* Reload the base panel and remember which variables it contains
use `base_panel', clear
qui desc, varlist
local base_vars `r(varlist)'

* Bring in the cost and use panel; base rows without spending data are kept
merge 1:1 bene_id rfrnc_yr using `cost_use_panel', keep(match master)
* The spending panel starts in 1999, so earlier base rows can never match
assert _merge == "master only (1)":_merge if rfrnc_yr < 1999
drop _merge in_mbsf_cu

* Cost and use variables = everything present now that was not in the base panel
qui desc, varlist
local all_vars `r(varlist)'
local cu_vars : list all_vars - base_vars
di "`cu_vars'"

* Blank out cost and use variables in years where the bene is not FFS-only
*   ffs_only is verified to be strictly 0/1 first, so !ffs_only means ffs_only == 0
assert inlist(ffs_only, 0, 1)
foreach v of local cu_vars {
  replace `v' = . if !ffs_only
}

* Generate raw mortality variable
*   Rows with a missing death date get died == 0 (year of a missing date is missing).
gen died = year(death_dt_firstnm) == rfrnc_yr
label var died "Indicator for death in rfrnc_yr"

* Generate adjusted mortality variable, analog to CPI-U adjusted spending (ref: 2017)
*   A yearly US mortality series (ages 20-84) is assembled from two CDC WONDER
*   extracts, normalized so that 2017 == 1, and used to deflate the death indicator.
preserve
  * ---------------------------
  * The two extracts share one layout, so they are cleaned and checked by the same loop.
  *   cmf: Compressed Mortality data for ages 20-84, by year (1979-1998)
  *   ucd: Underlying Cause of Death data for ages 20-84, by year (1999-2019)
  local cmf_path "$SSDIMed/data/raw/wonder/Compressed Mortality, 1979-1998.txt"
  local cmf_first 1979
  local cmf_last  1998
  local ucd_path "$SSDIMed/data/raw/wonder/Underlying Cause of Death, 1999-2019.txt"
  local ucd_first 1999
  local ucd_last  2019

  foreach src in cmf ucd {
    import delimited "``src'_path'", clear

    * Cleanup: rows carrying a note are not yearly data rows
    drop if !missing(notes)
    drop yearcode notes cruderate

    * QC: one row per year, exact expected year span, complete numeric inputs
    gisid year
    sum year
    assert r(min) == ``src'_first' & r(max) == ``src'_last'
    assert !missing(year, deaths, population)
    confirm numeric variable year deaths population

    * Deaths per 100,000
    gen mortality_rate = deaths / population * 100000
    label var mortality_rate "Deaths per 100,000"

    * Save (creates the temp files cmf_data and ucd_data)
    tempfile `src'_data
    save ``src'_data'
  }

  * ---------------------------
  * Combine all years, calculate adjustment factor
  use `cmf_data', clear
  append using `ucd_data'
  gisid year

  * The reference year must actually be present. Without this check a missing 2017
  * row would make the factor missing everywhere, and the conditional assert below
  * would pass vacuously.
  qui count if year == 2017
  assert r(N) == 1

  sum mortality_rate if year == 2017
  gen adj_mort_us_20_84_yyyy = mortality_rate / r(mean)
  label var adj_mort_us_20_84_yyyy "Mortality in US, ages 20-84, adjusted (ref: 2017)"
  assert adj_mort_us_20_84_yyyy == 1 if year == 2017

  rename year rfrnc_yr
  keep rfrnc_yr mortality_rate adj_mort_us_20_84_yyyy
  tempfile adj_mort
  save `adj_mort'
restore

* Every panel year must have an adjustment factor; unused reference years are discarded
merge m:1 rfrnc_yr using `adj_mort', assert(match using) keep(match) nogen noreport keepusing(adj_mort_us_20_84_yyyy) 
gen died_adj = died / adj_mort_us_20_84_yyyy
label var died_adj "(Yearly-adjusted) Indicator for death in rfrnc_yr"
drop adj_mort_us_20_84_yyyy

* Flags for original reason for entitlement (OREC)
*   The initial-observation OREC variable is not used; only the bene-level
*   "ever" flag below is built.
drop orec_init

* Bene-level flag: OREC is 1 or 3 in at least one observed year
gegen orec_dib_any_ever  = max(orec == 1 | orec == 3), by(bene_id)
label var orec_dib_any_ever "Original Medicare entitlement reason is DIB or DIB+ESRD, any observation"

* Alternative flags, currently disabled:
* bys bene_id (rfrnc_yr): gen esrd_init = inlist(esrd[1], 1)
* bys bene_id (rfrnc_yr): gen orec_dib_only_init = inlist(orec[1], 1)
* bys bene_id (rfrnc_yr): gen orec_dib_any_init  = inlist(orec[1], 1, 3)
* gegen orec_dib_only_ever  = max(inlist(orec, 1)), by(bene_id)
* label var esrd_init "Indicator for Medicare entitlement due to ESRD, initial observation"
* label var orec_dib_only_init "Original Medicare entitlement reason is DIB alone, initial observation"
* label var orec_dib_any_init "Original Medicare entitlement reason is DIB or DIB+ESRD, initial observation"
* label var orec_dib_only_ever "Original Medicare entitlement reason is DIB alone, any observation"

* Flag people whose initial BIC indicates an OREC of DIB*
*   The "DIB" BIC codes are learned from benes who join the sample in a year where
*   BIC codes are available. The initial BIC is missing exactly when init_yr is
*   2002-2005 (asserted below), so that condition is kept in one place.
local bic_gap "inrange(init_yr, 2002, 2005)"

bys bene_id (rfrnc_yr): gen first_row = _n==1
assert missing(crnt_bic_init) == `bic_gap'

* Share of benes with a DIB* OREC within each initial BIC code, computed on first
* rows outside the gap years, then spread to every row of the same bene
gegen bic_dib_share = mean(orec_dib_any_ever) if !`bic_gap' & first_row, by(crnt_bic_init)
gegen bic_dib_share = max(bic_dib_share), by(bene_id) replace
assert missing(bic_dib_share) == `bic_gap'

* Example: those with initial BIC of "A" have DIB coverage 99% of the time
sum orec_dib_any_ever if !`bic_gap' & first_row & (crnt_bic_init == "A")
assert `r(mean)' > 0.989
gstats tab orec_dib_any_ever if first_row, by(crnt_bic_init)
count if inrange(bic_dib_share, 0.95, 1)

* "DIB" BIC codes: those associated with ORECs of DIB* 98% of the time or more
*   Benes in the gap years have a missing share and therefore get 0 here.
gen dib_bic_init = inrange(bic_dib_share, 0.98, 1)
label var dib_bic_init "Initial BIC code is associated with ORECs of DIB* in at least 98% of cases"
drop bic_dib_share first_row

* DIB sample: a "DIB" initial BIC code, or a DIB* OREC in any observed year
gen dib_sample = inlist(1, dib_bic_init, orec_dib_any_ever)
label var dib_sample "Indictor for in DIB sample, = inlist(1, dib_bic_init, orec_dib_any_ever)"
* Sample membership must be constant within bene
bys bene_id (rfrnc_yr): assert dib_sample == dib_sample[1]

* How many years (cumulative) of having orec indicating dib?
*   (disabled: running count of years with OREC of 1 or 3, plus its consistency checks)
* by bene_id (rfrnc_yr): gen years_orec_dib = sum(inlist(orec, 1, 3))
* label var years_orec_dib "Years (to-date) with DIB as an original reason for entitlement"
* assert years_orec_dib == 0 if !orec_dib_any_ever
* by bene_id (rfrnc_yr): assert years_orec_dib[_N] > 0 if orec_dib_any_ever

* Save master analysis file
*   The check below confirms one row per bene-year; because it uses bys, it also
*   leaves the data sorted by bene_id rfrnc_yr before saving.
*   compress shrinks storage types without changing values.
bys bene_id rfrnc_yr: assert _N == 1
compress
save "$SSDIMed/data/temp/master_cohort_panel.dta", replace

* 10% and 1% subsets of benes from the analysis file, for testing
*   Selection is deterministic: benes are numbered by gegen group(bene_id) and
*   assigned to one of 100 buckets via mod(number, 100). No command in this
*   block draws random numbers, so the seed set here is not consumed by it.
*   The 1% file (bucket 0) is a subset of the 10% file (buckets 0-9).
set seed 0
gegen double benenum = group(bene_id)
gen double bene_bucket = mod(benenum, 100)
drop benenum

* 10% subset: buckets 0-9
keep if bene_bucket <= 9
preserve
  drop bene_bucket
  save "$SSDIMed/data/temp/master_cohort_panel10p.dta", replace
restore

* 1% subset: bucket 0 only
keep if bene_bucket == 0
drop bene_bucket
save "$SSDIMed/data/temp/master_cohort_panel1p.dta", replace


** EOF


